Load required packages

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Load and inspect MVP voting data

# Read the MVP voting results into a tibble; column types are guessed by readr.
mvp_voting <- read_csv(file = "Data/mvp_voting.csv")
## Rows: 719 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): Rank, Player, Tm
## dbl (18): Age, First, Pts Won, Pts Max, Share, G, MP, PTS, TRB, AST, STL, BL...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
mvp_voting

Select only columns pertaining to MVP voting results

# Keep only the join keys (Player, Year) and the voting-result columns.
mvp_voting <- select(
  mvp_voting,
  all_of(c("Player", "Year", "Pts Won", "Pts Max", "Share"))
)
mvp_voting

Load and inspect player stats

# Read the per-player season statistics into a tibble.
player_stats <- read_csv(file = "Data/player_stats.csv")
## Rows: 23881 Columns: 31
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): Player, Pos, Tm
## dbl (28): Rk, Age, G, GS, MP, FG, FGA, FG%, 3P, 3PA, 3P%, 2P, 2PA, 2P%, eFG%...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
player_stats

Remove Rank column

# Drop the Rk column and keep every other column in its original order.
player_stats <- select(player_stats, !Rk)
player_stats

Remove asterisks after names

# Strip every literal "*" from the player names (fixed() disables regex
# interpretation of the asterisk).
player_stats <- player_stats %>%
  mutate(Player = str_remove_all(Player, fixed("*")))
player_stats

Convert NA values for percentages to zeros. This also sets games started to zero for seasons that predate the tracking of that metric. I will not use that column in my models, so this should have no impact.

# Replace missing values with 0 in the numeric columns only. The character
# columns (Player, Pos, Tm) are left untouched: a numeric 0 is not a valid
# replacement for a character NA, and replace_na() aborts on that type mismatch.
player_stats <- player_stats %>%
  mutate(across(where(is.numeric), ~ replace_na(.x, 0)))

Group the data frame by player and year. Then handle cases where a player played for multiple teams in one season by keeping only the 'TOT' row for that season and labelling it with the last team the player played for

# Collapse the rows of one player-season into a single team assignment.
#
# df: the rows belonging to one player and one season, with a `Tm` column.
#     A row whose Tm is 'TOT' is presumably the combined season line for a
#     player who changed teams (TODO confirm against the data source).
#
# Returns `df` unchanged when it has at most one row or no 'TOT' row.
# Otherwise returns only the 'TOT' row(s), with Tm replaced by the team of the
# last non-'TOT' row, so the result never carries the placeholder 'TOT' as a
# team when a real team is available.
handle_multiple_teams <- function(df) {
  if (nrow(df) <= 1) {
    return(df)
  }

  # NA-safe mask: a missing Tm is never treated as the 'TOT' row.
  is_total <- !is.na(df$Tm) & df$Tm == "TOT"
  if (!any(is_total)) {
    return(df)
  }

  combined <- df[is_total, , drop = FALSE]

  # Use the last real team rather than blindly taking the last row, which
  # would yield 'TOT' itself whenever the 'TOT' row is ordered last.
  teams <- as.character(df$Tm[!is_total])
  teams <- teams[!is.na(teams)]
  if (length(teams) > 0) {
    combined$Tm <- teams[length(teams)]
  }

  combined
}
# Make sure Tm is character, then run handle_multiple_teams() once per
# (Player, Year) group and drop the grouping afterwards.
player_stats <- player_stats %>%
  mutate(Tm = as.character(Tm)) %>%
  group_by(Player, Year) %>%
  group_modify(function(season_rows, season_key) {
    handle_multiple_teams(season_rows)
  }) %>%
  ungroup()
player_stats

Merge MVP voting with player stats

# Full join on Player and Year keeps rows from both tables; rows with no MVP
# voting record get 0 for the three voting columns.
player_stats_with_mvp_voting <- player_stats %>%
  full_join(mvp_voting, by = c("Player", "Year")) %>%
  mutate(across(c(`Pts Won`, `Pts Max`, Share), ~ replace_na(.x, 0)))
player_stats_with_mvp_voting

Load and inspect team stats

# Read the per-team season statistics into a tibble. Uses `<-` for assignment,
# consistent with the rest of the script.
team_stats <- read_csv('Data/team_stats.csv')
## Rows: 1254 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): GB, Team
## dbl (7): W, L, W/L%, PS/G, PA/G, SRS, Year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
team_stats

Remove asterisks and seeds from team names

# Clean the team names in one pass: drop literal asterisks, drop any
# parenthesised group such as a seed, then collapse leftover whitespace.
team_stats <- team_stats %>%
  mutate(
    Team = Team %>%
      str_replace_all(fixed("*"), "") %>%
      str_replace_all("\\([^\\)]+\\)", "") %>%
      str_squish()
  )
team_stats

Change dashes for games back to zeros

# Replace every em dash in GB with the character "0"; GB is still a character
# column after this step.
team_stats$GB <- str_replace_all(team_stats$GB, "—", "0")
team_stats

Convert games back from characters to numeric

# Convert GB from character to numeric; any value that does not parse as a
# number becomes NA with a warning.
team_stats$GB <- as.numeric(team_stats$GB)
team_stats

Load mapping from full name to abbreviation

# Build `abbreviations`, a named list whose names are the first
# comma-separated field of each line and whose values are the second field.
# The first line of the file is skipped as a header.
lines <- read_lines("Data/abbreviations.csv")

# strsplit() is vectorised, so all lines are split in one call instead of
# growing the list one element at a time.
abbreviation_fields <- strsplit(lines[-1], ",")
abbreviation_keys <- vapply(
  abbreviation_fields, function(parts) parts[1], character(1)
)
team_names <- vapply(
  abbreviation_fields, function(parts) parts[2], character(1)
)

# Skip lines with no abbreviation (e.g. a blank trailing line), which would
# otherwise create an NA-named entry.
has_key <- !is.na(abbreviation_keys) & nzchar(abbreviation_keys)
abbreviation_keys <- abbreviation_keys[has_key]
team_names <- team_names[has_key]

# If an abbreviation is repeated, the last line wins, as with repeated
# assignment into a list.
is_latest <- !duplicated(abbreviation_keys, fromLast = TRUE)
abbreviations <- as.list(
  setNames(team_names[is_latest], abbreviation_keys[is_latest])
)

Add full names to player stats with MVP voting

# Add a Team column by translating Tm through the abbreviations mapping;
# values of Tm with no entry in the mapping are carried over unchanged.
player_stats_with_mvp_voting <- mutate(
  player_stats_with_mvp_voting,
  Team = recode(Tm, !!!abbreviations)
)
player_stats_with_mvp_voting

Merge player stats (with MVP voting) with team stats

# Full join on Team and Year keeps unmatched rows from both sides, with NA
# in the columns coming from the other table.
everything <- player_stats_with_mvp_voting %>%
  full_join(team_stats, by = c("Team", "Year"))
everything

Save combined stats to csv

# Write the combined table to disk as a CSV file.
write_csv(x = everything, file = "Data/combined_stats.csv")